library("tidyverse")
library("plyr")
library("dplyr")
library("ggplot2")
library("scales")
library("ggpubr")
library("gridExtra")
library("grid")
library("GGally")
library("vcfR") # for extracting genotype data from a vcf file
library("data.table")
library("stringr")
library("janitor")
library("plotly")
library("graphics")
library("quest")
library("cowplot")
#library("gmodels")
#library("rstatix")
#library("freqtables")
#library("broom")
#library("patchwork") # for gathering the plots
#library("fuzzyjoin") # to join tables based on a string in a column
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE,
fig.width = 10,
fig.asp = 0.4,
out.width = "100%")
#fig.width = 6,fig.asp = 0.8,out.width = "100%"
Next we will examine the mean depth for each of our variants. This is essentially the number of reads that have mapped to this position. The output we generated with vcftools is the mean of the read depth across all individuals - it is for both alleles at a position and is not partitioned between the reference and the alternative. First we read in the data.
depth_all <- read_delim("/Users/nuriteliash/Documents/GitHub/varroa-linkage-map/data/vcf_filter/Q40BIALLDP16HDP40mis.5Chr7/Q40BIALLDP16HDP40mis.5Chr7.ldepth.mean", delim = "\t",
col_names = c("chr", "pos", "mean_depth", "var_depth"), skip = 1)
#a <- ggplot(depth_all, aes(mean_depth)) + geom_density(fill = "dodgerblue1", colour = "black", alpha = 0.3)
#a + theme_light()
summary(depth_all$mean_depth)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 14.65 17.92 19.09 19.39 20.51 40.85
Again take a moment to look at the data - mean_depth is our column of interest but note that you can also get a an idea of the variance in depth among individuals from the var_depth column. Once again, we will use ggplot to look at the distribution of read depths ### Mean depth by sex
depth_fem <- read_delim("/Users/nuriteliash/Documents/GitHub/varroa-linkage-map/data/vcf_filter/female.Q40BIALLDP16HDP40mis.5Chr7/female.Q40BIALLDP16HDP40mis.5Chr7.ldepth.mean", delim = "\t",
col_names = c("chr", "pos", "mean_depth", "var_depth"), skip = 1) %>%
dplyr::mutate(sex ="Female")
depth_male <- read_delim("/Users/nuriteliash/Documents/GitHub/varroa-linkage-map/data/vcf_filter/male.Q40BIALLDP16HDP40mis.5Chr7/male.Q40BIALLDP16HDP40mis.5Chr7.ldepth.mean", delim = "\t",
col_names = c("chr", "pos", "mean_depth", "var_depth"), skip = 1)%>%
dplyr::mutate(sex ="Male")
depth_sex = rbind(depth_fem,depth_male) %>% unite(site, c("chr", "pos"))
ggplot(depth_sex, aes(mean_depth, fill = sex)) + geom_density(colour = "black", alpha = 0.5) + theme_light() +
ggtitle("Mean site depth, by sex")
ggplot(depth_sex, aes(var_depth, fill = sex)) + geom_density(colour = "black", alpha = 0.5) + theme_light()+
ggtitle("Mean site depth variance, by sex")
the site’s depth is somewhat higher in females, compared to males.
the variance is similar.
Next up we will look at the proportion of missingness at each variant. This is a measure of how many individuals lack a genotype at a call site.
miss_fem <- read_delim("/Users/nuriteliash/Documents/GitHub/varroa-linkage-map/data/vcf_filter/female.Q40BIALLDP16HDP40mis.5Chr7/female.Q40BIALLDP16HDP40mis.5Chr7.lmiss", delim = "\t",
col_names = c("chr", "pos", "nchr", "nfiltered", "nmiss", "fmiss"), skip = 1) %>%
dplyr::mutate(sex ="Female")
miss_male <- read_delim("/Users/nuriteliash/Documents/GitHub/varroa-linkage-map/data/vcf_filter/male.Q40BIALLDP16HDP40mis.5Chr7/male.Q40BIALLDP16HDP40mis.5Chr7.lmiss", delim = "\t",
col_names = c("chr", "pos", "nchr", "nfiltered", "nmiss", "fmiss"), skip = 1) %>%
dplyr::mutate(sex ="Male")
miss_sex = rbind(miss_fem,miss_male) %>% unite(site, c("chr", "pos"))
ggplot(miss_sex, aes(fmiss, fill = sex)) + geom_density(colour = "black", alpha = 0.5) + theme_light() +
ggtitle("Variant missingness, by sex")
summary(miss_sex$fmiss)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.05357 0.41071 0.46237 0.46493 0.51786 0.71429
The first metric we will look at is the (Phred encoded) site quality. This is a measure of how much confidence we have in our variant calls. First of all, we read in the site quality report we generated using vcftools. We will use the read_delim command from the readr package (part of the the tidyverse) because it is more efficient for reading in large datafiles. It also allows us to set our own column names.
Take a look at the data when it is read in. You will see that for each site in our subsampled VCF, we have extracted the site quality score. Now we will plot the distribution of this quality using ggplot. Usually, the geom_density function works best, but you can use geom_histogram too.
qual_fem <- read_delim("/Users/nuriteliash/Documents/GitHub/varroa-linkage-map/data/vcf_filter/female.Q40BIALLDP16HDP40mis.5Chr7/female.Q40BIALLDP16HDP40mis.5Chr7.lqual", delim = "\t",
col_names = c("chr", "pos", "qual"), skip = 1) %>%
dplyr::mutate(sex ="Female")
qual_male <-read_delim("/Users/nuriteliash/Documents/GitHub/varroa-linkage-map/data/vcf_filter/male.Q40BIALLDP16HDP40mis.5Chr7/male.Q40BIALLDP16HDP40mis.5Chr7.lqual", delim = "\t",
col_names = c("chr", "pos", "qual"), skip = 1) %>%
dplyr::mutate(sex ="Male")
qual_sex = rbind(qual_fem,qual_male) %>% unite(site, c("chr", "pos"))
ggplot(qual_sex, aes(qual, fill = sex)) + geom_density(colour = "black", alpha = 0.5) + theme_light() +
ggtitle("Variant quality, by sex")
summary(qual_male$qual)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 40.48 33240.80 48198.10 52338.46 69672.30 191031.00
summary(qual_fem$qual)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 40.48 33240.80 48198.10 52338.46 69672.30 191031.00
the quality is same for individuals, - its already normalized.
Extract genotypes for each site and individual. The metadata for all samples can be found in here.
vcf <- read.vcfR("/Users/nuriteliash/Documents/GitHub/varroa-linkage-map/data/vcf_filter/Q40BIALLDP16HDP40mis.5Chr7/Q40BIALLDP16HDP40mis.5Chr7.recode.vcf", verbose = FALSE )
vcf
## ***** Object of Class vcfR *****
## 223 samples
## 7 CHROMs
## 35,169 variants
## Object size: 293.5 Mb
## 0 percent missing data
## ***** ***** *****
# extract the genotype for each site in each individual
gt <- extract.gt(vcf, element = "GT")
gt <- as.data.frame(t(gt)) %>%
rownames_to_column("ID")
#clean the ID names
gt$ID <- sub("_[^_]+$", "", gt$ID)
table <- gt %>%
t() %>%
as.data.frame() %>%
row_to_names(row_number = 1) %>%
dplyr::select(contains(c("son", "dat", "fnd"))) # keep only adults of F0, F1 and F2
# set the families (include only families with at least one adult F2)
family = grep("grndat|grnson",gt$ID, value=TRUE) %>%
str_extract("[^_]+") %>%
unique()
# or, include all F2 samples, but indicate if they have an adult sister (may indicate if the F1 female was fertilized)
#family = grep("grn",gt$ID, value=TRUE) %>%
# str_extract("[^_]+") %>%
# unique()
# define a list to put all the data frames in
obs <- list()
for (fam in family) {
obs[[fam]] <- table %>%
dplyr::select(starts_with(fam)) %>%
dplyr::filter_at(vars(matches("_fnd")), all_vars(. == "0/0")) %>% # force F0 female to be homo, like her son
dplyr::filter_at(vars(matches("_son")), all_vars(. == "0/0")) %>%
dplyr::filter_at(vars(matches("_dat")), all_vars(. == "0/1")) %>%
dplyr::select(contains("grn")) %>%
rownames_to_column("site") %>%
tidyr::pivot_longer(-site) %>%
dplyr::rename(sample = name, gt = value) %>%
mutate(sex = case_when(
grepl("son", sample) ~ "Male",
grepl("dat", sample) ~ "Female")) %>%
dplyr::filter(sex == "Male") %>%
dplyr::mutate(site_status = case_when(
grepl("0/1", gt) ~ "Normal",
grepl("0/0", gt) ~ "Exception")) %>%
na.omit()
}
F2_male_00_01 = do.call("rbind", obs) %>% dplyr::select(c("site","sample","site_status"))
### depth
F2_male_00_01_depth = depth_sex %>%
dplyr::filter(sex =="Male") %>%
full_join(F2_male_00_01, by = "site") %>%
na.omit() %>%
dplyr::select(c("site","mean_depth", "sample", "site_status"))
p_F2_male_00_01_depth_box = F2_male_00_01_depth %>% ggplot(aes(x = site_status, y = mean_depth)) +
geom_boxplot()+ggtitle("Variant depth in F2 males, of F1 cross 0/0 x 0/1")
p_F2_male_00_01_depth_dens = F2_male_00_01_depth %>% ggplot(aes(mean_depth, fill = site_status)) +
geom_density(colour = "black", alpha = 0.5) + theme_light() +
ggtitle("Variant depth in F2 males, of F1 cross 0/0 x 0/1")
### quality
F2_male_00_01_qual = qual_sex %>%
dplyr::filter(sex =="Male") %>%
full_join(F2_male_00_01, by = "site") %>%
na.omit() %>%
dplyr::select(c("site","qual", "sample", "site_status"))
p_F2_male_00_01_qual_box = F2_male_00_01_qual %>% ggplot(aes(x = site_status, y = qual)) +
geom_boxplot() + ggtitle("Variant quality in F2 males, of F1 cross 0/0 x 0/1") + theme_light()
p_F2_male_00_01_qual = F2_male_00_01_qual %>% ggplot(aes(qual, fill = site_status)) +
geom_density(colour = "black", alpha = 0.5) + theme_light() +
ggtitle("Variant quality in F2 males, of F1 cross 0/0 x 0/1")
# define a list to put all the data frames in
obs <- list()
for (fam in family) {
obs[[fam]] <- table %>%
dplyr::select(starts_with(fam)) %>%
dplyr::filter_at(vars(matches("_fnd")), all_vars(. == "1/1")) %>% # force F0 female to be homo, like her son
dplyr::filter_at(vars(matches("_son")), all_vars(. == "1/1")) %>%
dplyr::filter_at(vars(matches("_dat")), all_vars(. == "0/1")) %>%
dplyr::select(contains("grn")) %>%
rownames_to_column("site") %>%
tidyr::pivot_longer(-site) %>%
dplyr::rename(sample = name, gt = value) %>%
mutate(sex = case_when(
grepl("son", sample) ~ "Male",
grepl("dat", sample) ~ "Female")) %>%
dplyr::filter(sex == "Male") %>%
dplyr::mutate(site_status = case_when(
grepl("0/1", gt) ~ "Normal",
grepl("1/1", gt) ~ "Exception")) %>%
na.omit()
}
F2_male_11_01 = do.call("rbind", obs) %>% dplyr::select(c("site","sample","site_status"))
### depth
F2_male_11_01_depth = depth_sex %>%
dplyr::filter(sex =="Male") %>%
full_join(F2_male_11_01, by = "site") %>%
na.omit() %>%
dplyr::select(c("site","mean_depth", "sample", "site_status"))
p_F2_male_11_01_depth_box = F2_male_11_01_depth %>% ggplot(aes(x = site_status, y = mean_depth)) +
geom_boxplot()+ggtitle("Variant depth in F2 males, of F1 cross 1/1 x 0/1")
p_F2_male_11_01_depth_dens = F2_male_11_01_depth %>% ggplot(aes(mean_depth, fill = site_status)) +
geom_density(colour = "black", alpha = 0.5) + theme_light() +
ggtitle("Variant depth in F2 males, of F1 cross 1/1 x 0/1")
### quality
F2_male_11_01_qual = qual_sex %>%
dplyr::filter(sex =="Male") %>%
full_join(F2_male_11_01, by = "site") %>%
na.omit() %>%
dplyr::select(c("site","qual", "sample", "site_status"))
p_F2_male_11_01_qual_box = F2_male_11_01_qual %>% ggplot(aes(x = site_status, y = qual)) +
geom_boxplot() + ggtitle("Variant quality in F2 males, of F1 cross 1/1 x 0/1") + theme_light()
p_F2_male_11_01_qual = F2_male_11_01_qual %>% ggplot(aes(qual, fill = site_status)) +
geom_density(colour = "black", alpha = 0.5) + theme_light() +
ggtitle("Variant quality in F2 males, of F1 cross 1/1 x 0/1")
# define a list to put all the data frames in
obs <- list()
for (fam in family) {
obs[[fam]] <- table %>%
dplyr::select(starts_with(fam)) %>%
dplyr::filter_at(vars(matches("_fnd")), all_vars(. == "0/1")) %>% # force F0 female to be homo, like her son
dplyr::filter_at(vars(matches("_son")), all_vars(. == "0/1")) %>%
dplyr::filter_at(vars(matches("_dat")), all_vars(. == "0/0")) %>%
dplyr::select(contains("grn")) %>%
rownames_to_column("site") %>%
tidyr::pivot_longer(-site) %>%
dplyr::rename(sample = name, gt = value) %>%
mutate(sex = case_when(
grepl("son", sample) ~ "Male",
grepl("dat", sample) ~ "Female")) %>%
dplyr::filter(sex == "Male") %>%
dplyr::mutate(site_status = case_when(
grepl("0/0", gt) ~ "Normal",
grepl("0/1", gt) ~ "Exception")) %>%
na.omit()
}
F2_male_01_00 = do.call("rbind", obs) %>% dplyr::select(c("site","sample","site_status"))
### depth
F2_male_01_00_depth = depth_sex %>%
dplyr::filter(sex =="Male") %>%
full_join(F2_male_01_00, by = "site") %>%
na.omit() %>%
dplyr::select(c("site","mean_depth", "sample", "site_status"))
p_F2_male_01_00_depth_box = F2_male_01_00_depth %>% ggplot(aes(x = site_status, y = mean_depth)) +
geom_boxplot()+ggtitle("Variant depth in F2 males, of F1 cross 0/1 x 0/0")
p_F2_male_01_00_depth_dens = F2_male_01_00_depth %>% ggplot(aes(mean_depth, fill = site_status)) +
geom_density(colour = "black", alpha = 0.5) + theme_light() +
ggtitle("Variant depth in F2 males, of F1 cross 0/1 x 0/0")
### quality
F2_male_01_00_qual = qual_sex %>%
dplyr::filter(sex =="Male") %>%
full_join(F2_male_01_00, by = "site") %>%
na.omit() %>%
dplyr::select(c("site","qual", "sample", "site_status"))
p_F2_male_01_00_qual_box = F2_male_01_00_qual %>% ggplot(aes(x = site_status, y = qual)) +
geom_boxplot() + ggtitle("Variant quality in F2 males, of F1 cross 0/1 x 0/0") + theme_light()
p_F2_male_01_00_qual = F2_male_01_00_qual %>% ggplot(aes(qual, fill = site_status)) +
geom_density(colour = "black", alpha = 0.5) + theme_light() +
ggtitle("Variant quality in F2 males, of F1 cross 0/1 x 0/0")
# define a list to put all the data frames in
obs <- list()
for (fam in family) {
obs[[fam]] <- table %>%
dplyr::select(starts_with(fam)) %>%
dplyr::filter_at(vars(matches("_fnd")), all_vars(. == "0/1")) %>% # force F0 female to be homo, like her son
dplyr::filter_at(vars(matches("_son")), all_vars(. == "0/1")) %>%
dplyr::filter_at(vars(matches("_dat")), all_vars(. == "1/1")) %>%
dplyr::select(contains("grn")) %>%
rownames_to_column("site") %>%
tidyr::pivot_longer(-site) %>%
dplyr::rename(sample = name, gt = value) %>%
mutate(sex = case_when(
grepl("son", sample) ~ "Male",
grepl("dat", sample) ~ "Female")) %>%
dplyr::filter(sex == "Male") %>%
dplyr::mutate(site_status = case_when(
grepl("1/1", gt) ~ "Normal",
grepl("0/1", gt) ~ "Exception")) %>%
na.omit()
}
F2_male_01_11 = do.call("rbind", obs) %>% dplyr::select(c("site","sample","site_status"))
### depth
F2_male_01_11_depth = depth_sex %>%
dplyr::filter(sex =="Male") %>%
full_join(F2_male_01_11, by = "site") %>%
na.omit() %>%
dplyr::select(c("site","mean_depth", "sample", "site_status"))
p_F2_male_01_11_depth_box = F2_male_01_11_depth %>% ggplot(aes(x = site_status, y = mean_depth)) +
geom_boxplot()+ggtitle("Variant depth in F2 males, of F1 cross 0/1 x 1/1")
p_F2_male_01_11_depth_dens = F2_male_01_11_depth %>% ggplot(aes(mean_depth, fill = site_status)) +
geom_density(colour = "black", alpha = 0.5) + theme_light() +
ggtitle("Variant depth in F2 males, of F1 cross 0/1 x 1/1")
### quality
F2_male_01_11_qual = qual_sex %>%
dplyr::filter(sex =="Male") %>%
full_join(F2_male_01_11, by = "site") %>%
na.omit() %>%
dplyr::select(c("site","qual", "sample", "site_status"))
p_F2_male_01_11_qual_box = F2_male_01_11_qual %>% ggplot(aes(x = site_status, y = qual)) +
geom_boxplot() + ggtitle("Variant quality in F2 males, of F1 cross 0/1 x 1/1") + theme_light()
p_F2_male_01_11_qual = F2_male_01_11_qual %>% ggplot(aes(qual, fill = site_status)) +
geom_density(colour = "black", alpha = 0.5) + theme_light() +
ggtitle("Variant quality in F2 males, of F1 cross 0/1 x 1/1")
pool all sites together
F2_male = rbind(mutate(F2_male_00_01_qual, cross = "male_00_01"),
mutate(F2_male_11_01_qual, cross = "male_11_01"),
mutate(F2_male_01_00_qual, cross = "male_01_00"),
mutate(F2_male_01_11_qual, cross = "male_01_11")) %>%
dplyr::select(c("site", "qual", "site_status","cross"))
F2_male %>% ggplot(aes(qual, fill = site_status)) +
geom_density(colour = "black", alpha = 0.5) + theme_light() +
ggtitle("Variant quality in F2 males") +
facet_wrap(~cross, ncol = 1,scales = "free_y" ) +
scale_x_continuous(n.breaks = 12) +
theme(axis.text.x=element_text(angle=90,hjust=1))
plot quality in each cross:
legend <- get_legend(p_F2_male_00_01_qual) # get the legend of the first one plot
# here the plots in a grid
prow <- plot_grid( p_F2_male_00_01_qual + theme(legend.position="none"),
# here you add the percentage
p_F2_male_11_01_qual + theme(legend.position="none")+ scale_y_continuous(),
p_F2_male_01_00_qual + theme(legend.position="none")+ scale_y_continuous(),
p_F2_male_01_11_qual + theme(legend.position="none")+ scale_y_continuous(),
align = 'v',
#labels = c("A", "B"),
hjust = -1,
nrow = 4)
# here you add the legend
plot_grid( prow, legend, rel_widths = c(1, .2))
# Draw density plots
grid.arrange(arrangeGrob(p_F2_male_00_01_qual, p_F2_male_11_01_qual, p_F2_male_01_00_qual, p_F2_male_01_11_qual,ncol = 1), heights = c(10, 1),top = grid::textGrob("all crosses", x = 0, hjust = 0))
# Draw box plots
grid.arrange(arrangeGrob(p_F2_male_00_01_qual_box, p_F2_male_11_01_qual_box, p_F2_male_01_00_qual_box, p_F2_male_01_11_qual_box ,ncol = 1), heights = c(10, 1),top = grid::textGrob("all crosses", x = 0, hjust = 0))
plot depth in each cross:
# Draw density plots
grid.arrange(arrangeGrob(p_F2_male_00_01_depth_dens, p_F2_male_11_01_depth_dens, p_F2_male_01_00_depth_dens, p_F2_male_01_11_depth_dens,ncol = 1), heights = c(10, 1),top = grid::textGrob("all crosses", x = 0, hjust = 0))
# Draw box plots
grid.arrange(arrangeGrob(p_F2_male_00_01_depth_box, p_F2_male_11_01_depth_box, p_F2_male_01_00_depth_box, p_F2_male_01_11_depth_box ,ncol = 1), heights = c(10, 1),top = grid::textGrob("all crosses", x = 0, hjust = 0))
As well as a our per variant statistics we generated earlier, we also calculated some individual metrics too. WE can look at the distribution of these to get an idea whether some of our individuals have not sequenced or mapped as well as others. This is good practice to do with a new dataset. A lot of these statistics can be compared to other measures generated from the data (i.e. principal components as a measure of population structure) to see if they drive any apparent patterns in the data.
First we will look at the distribution of mean depth among individuals. We read the data in with read_delim:
# NO filtering
ind_depth <- read_delim("/Users/nuriteliash/Documents/GitHub/varroa-linkage-map/data/vcf_filter/Q40BIALLDP16HDP40mis.5Chr7/Q40BIALLDP16HDP40mis.5Chr7.idepth", delim = "\t",
col_names = c("ind", "nsites", "depth"), skip = 1) %>%
mutate(sex = case_when(
grepl("son", ind) ~ "Male",
grepl("dat|fnd", ind) ~ "Female")) %>%
na.omit()
ind_depth$ind <- sub("_[^_]+$", "", ind_depth$ind)
#Then we plot the distribution as a histogram using ggplot and geom_hist.
ggplot(ind_depth, aes(depth)) + geom_histogram(fill = "dodgerblue1", colour = "black", alpha = 0.3)+
theme_light()
ggplot(ind_depth, aes(depth)) + geom_density(fill = "dodgerblue1", colour = "black", alpha = 0.3)+
theme_light()
p_ind_depth = ind_depth %>% filter(grepl("grnson|grndat", ind)) %>%
ggplot(aes(y = depth, x = ind, color = sex, label = ind)) + geom_point() +
theme_classic() + theme(axis.text.x = element_blank())
ggplotly(p_ind_depth)
#p_ind_depth +
# geom_label() +
# geom_text(angle = 45,hjust = 0, nudge_x = 0.05)
summary(ind_depth$depth)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6.511 12.495 17.810 19.245 25.794 38.771
individuals with high depth (>30)
ind_depth %>% dplyr::filter(depth > 30)
## # A tibble: 15 × 4
## ind nsites depth sex
## <chr> <dbl> <dbl> <chr>
## 1 133_133_fnd 35169 34.9 Female
## 2 476_477b_grnson 35169 30.0 Male
## 3 46_47c_grndat 35169 32.4 Female
## 4 302_303_dat 35169 34.6 Female
## 5 478_478_fnd 35169 34.3 Female
## 6 498_499_dat 35169 38.8 Female
## 7 300_300a_son 35169 32.0 Male
## 8 43_44_dat 35169 30.7 Female
## 9 284_284_fnd 35169 37.9 Female
## 10 564_565-1b_grnson 35169 34.0 Male
## 11 458_459a_grnson 35168 31.6 Male
## 12 476_476_fnd 35169 30.4 Female
## 13 133_133a_son 35169 30.8 Male
## 14 478_479-1_dat 35169 31.6 Female
## 15 48_49_dat 35169 30.9 Female
individuals with low depth (<10)
ind_depth %>% filter(depth < 10 )
## # A tibble: 17 × 4
## ind nsites depth sex
## <chr> <dbl> <dbl> <chr>
## 1 322_322_fnd 34061 7.42 Female
## 2 300_301_dat 33914 7.42 Female
## 3 596_597a_grnson 33837 8.29 Male
## 4 266_266a_son 32842 6.51 Male
## 5 110_110_fnd 33728 7.32 Female
## 6 187_188a_grnson 33632 7.15 Male
## 7 498_498_fnd 34574 8.96 Female
## 8 300_301a_grnson 34099 8.67 Male
## 9 177_178b_grnson 33963 7.04 Male
## 10 240_241c_grnson 34833 9.36 Male
## 11 266_266_fnd 33878 7.21 Female
## 12 478_479-2b_grnson 34677 9.02 Male
## 13 564_565-2a_grndat 34648 9.74 Female
## 14 177_178a_grndat 33541 6.62 Female
## 15 43_43_fnd 34397 7.87 Female
## 16 478_479-2a_grndat 34710 8.77 Female
## 17 502_502_fnd 34632 9.61 Female
Next we will look at the proportion of missing data per individual. We read in the data below:
ind_miss <- read_delim("/Users/nuriteliash/Documents/GitHub/varroa-linkage-map/data/vcf_filter/Q40BIALLDP16HDP40mis.5Chr7/Q40BIALLDP16HDP40mis.5Chr7.imiss", delim = "\t", col_names = c("ind", "ndata", "nfiltered", "nmiss", "fmiss"), skip = 1) %>%
mutate(sex = case_when(
grepl("son", ind) ~ "Male",
grepl("dat|fnd", ind) ~ "Female")) %>%
na.omit()
ind_miss$ind <- sub("_[^_]+$", "", ind_miss$ind)
p_ind_miss = ind_miss %>% filter(grepl("grnson|grndat", ind)) %>%
ggplot(aes(y = fmiss, x = ind, color = sex)) + geom_point() + theme_classic()
ggplotly(p_ind_miss)
summary(ind_miss$fmiss)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0967 0.1677 0.3899 0.4512 0.7280 0.9574
individuals with high missingness (>0.7)
ind_miss %>% dplyr::filter(fmiss > 0.7)
## # A tibble: 39 × 6
## ind ndata nfiltered nmiss fmiss sex
## <chr> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 596_596_fnd 35169 0 29393 0.836 Female
## 2 57_58b_grnson 35169 0 25086 0.713 Male
## 3 57_57_fnd 35169 0 26129 0.743 Female
## 4 57_57a_son 35169 0 29440 0.837 Male
## 5 43_43a_son 35169 0 25215 0.717 Male
## 6 502_503b_grnson 35169 0 26472 0.753 Male
## 7 426_426a_son 35169 0 29759 0.846 Male
## 8 412_413a_grnson 35169 0 26609 0.757 Male
## 9 322_323_dat 35169 0 29327 0.834 Female
## 10 322_322_fnd 35169 0 32729 0.931 Female
## # … with 29 more rows
individuals with low missingness (<0.15)
ind_miss %>% filter(fmiss < 0.15 )
## # A tibble: 26 × 6
## ind ndata nfiltered nmiss fmiss sex
## <chr> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 63_64_dat 35169 0 3509 0.0998 Female
## 2 63_63a_son 35169 0 4700 0.134 Male
## 3 400_400a_son 35169 0 4565 0.130 Male
## 4 57_58_dat 35169 0 3484 0.0991 Female
## 5 338_339a_grndat 35169 0 3783 0.108 Female
## 6 338_338_fnd 35169 0 3492 0.0993 Female
## 7 46_47_dat 35169 0 5127 0.146 Female
## 8 110_111c_grnson 35169 0 4033 0.115 Male
## 9 177_177a_son 35169 0 3901 0.111 Male
## 10 177_178_dat 35169 0 4685 0.133 Female
## # … with 16 more rows
sample_yield <- read_delim("/Users/nuriteliash/Documents/GitHub/varroa-linkage-map/data/sample_yield.csv", delim = ",") %>%
mutate(sex = case_when(
grepl("son", ind) ~ "Male",
grepl("dat|fnd", ind) ~ "Female")) %>%
na.omit()
p_sample_yield = sample_yield %>% filter(grepl("grnson|grndat", ind)) %>%
ggplot(aes(y = conc, x = ind, color = sex, label = ind)) + geom_point() +
theme_classic() + theme(axis.text.x = element_blank())
ggplotly(p_sample_yield)
sample_yield %>% ggplot(aes(x = sex, y = conc)) +
geom_boxplot() + ggtitle("Sample yield")
males have lower yield compare to females
samples_param = sample_yield %>% left_join(ind_miss, by = "ind") %>% left_join(ind_depth, by ="ind") %>%
dplyr::select(c("ind", "sex", "Extraction_date","conc","fmiss","depth")) %>%
mutate(conc = as.numeric(conc)) %>%
mutate(fmiss = as.numeric(fmiss)) %>%
mutate(depth = as.numeric(depth))
df = samples_param %>%
dplyr::select(c("conc","fmiss","depth"))
pairs(~ conc + fmiss + depth, data = df, upper.panel = NULL)
No correlation between sample’s depth, or missingnes, And the
concentration upon extraction.
samples_param %>% ggscatter(x = "conc", y = "fmiss", color = "sex")
samples_param %>% ggscatter(x = "conc", y = "depth", color = "sex")
lower concentration for male samples
abnorm_males = tibble(ind = c("240_241c_grnson", "400_401a_grnson","412_413a_grnson", "426_427b_grnson", "458_459a_grnson", "46_47d_grnson"), sex = "Male", normality = "abnormal")
sample_normality = samples_param %>%
left_join(abnorm_males, by = "ind") %>%
dplyr::select(-sex.y) %>%
replace(is.na(.), "normal") %>%
dplyr::rename(sex = sex.x) %>%
unite("type", sex,normality, remove = FALSE)
#sample_normality %>%
# dplyr::filter(sex =="Male") %>%
#pivot_longer(c(conc, fmiss, depth)) %>%
#ggplot(aes( y = value, color = type )) +
# geom_boxplot()+
#ggtitle("Sample quality") +
# facet_wrap(~name, scales = "free_y") +
#theme_bw() +
#theme(axis.text.x = element_blank())
sample_normality %>%
# dplyr::filter(sex =="Male") %>%
pivot_longer(c(conc, fmiss, depth)) %>%
ggplot(aes( y = value, x = type, color = type )) +
geom_boxplot()+
ggtitle("Sample quality") +
facet_wrap(~name, scales = "free_y") +
theme_bw() +
theme(axis.text.x = element_blank()) +theme(axis.text.x = element_blank(),
axis.title.x =element_blank() ) +
geom_jitter(alpha = 0.5)
p = sample_normality %>%
#dplyr::filter(sex =="Male") %>%
ggscatter(x = "depth", y = "fmiss", color = "type")
ggplotly(p)
abnormal males dont have exceptional yield, depth or missingness.
dp = extract.gt(vcf, element = "DP", as.numeric =TRUE)
dp <- as.data.frame(t(dp)) %>%
rownames_to_column("ind")
#clean the ID names
dp$ind <- sub("_[^_]+$", "", dp$ind)
table_dp <- dp %>%
t() %>%
as.data.frame() %>%
row_to_names(row_number = 1) %>%
dplyr::select(contains(c("son", "dat", "fnd"))) # keep only adults of F0, F1 and F2
summary(depth_all$mean_depth)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 14.65 17.92 19.09 19.39 20.51 40.85
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 14.65 17.92 19.09 19.39 20.51 40.85
based on the quantiles values of the mean depth of individuals, I
count the sites with:
- “low coverage” (<Q1) 17.92.
- “high coverage” (>Q3) 20.51.
then we calculate the frequency of low and high coverage sites, per
individual
dp_df <- dp %>%
column_to_rownames("ind") %>%
mutate(total = rowSums(across(where(is.numeric)) > 0, na.rm = TRUE)) %>%
mutate(low_dp = rowSums(across(where(is.numeric)) < 17.92, na.rm = TRUE)) %>%
mutate(high_dp = rowSums(across(where(is.numeric)) > 20.51, na.rm = TRUE)) %>%
mutate(low_freq = low_dp/total) %>%
mutate(high_freq = high_dp/total) %>%
dplyr::select(-contains("NW")) %>%
rownames_to_column("ind") %>%
inner_join(sample_normality, by = "ind")
p_box_low = dp_df %>%
# dplyr::filter(sex =="Male") %>%
ggplot(aes(x = type, y = low_freq, color = type ,text = ind)) +
geom_boxplot()+ggtitle("Frequency of low coverage sites")+
theme(axis.text.x = element_blank(),
axis.title.x =element_blank() ) +
geom_jitter()+ theme_bw()
p_box_high = dp_df %>%
# dplyr::filter(sex =="Male") %>%
ggplot(aes(x = type, y = high_freq, color = type ,text = ind)) +
geom_boxplot()+ggtitle("Frequency of HIGH coverage sites")+
theme(axis.text.x = element_blank(),
axis.title.x =element_blank() ) +
geom_jitter() + theme_bw()
ggplotly(p_box_low, tooltip = "text")
ggplotly(p_box_high, tooltip = "text")
correlation btw frequcny of coverage sites, and mean_depth
dp_df %>%
dplyr::filter(sex =="Male") %>%
ggscatter(x = "depth", y = "low_freq", color = "type")